%This script generates the daily running activity for each individual under consideration.
%Input files: SPA_Social_graph.txt, App_Users_in_Graph_demographics.txt, USERREL.txt
%Output files:
%run_mat.mat, distance_mat.mat, duration_mat.mat, calories_mat.mat, pace_mat.mat, StartTime_mat.mat, TimeZone_mat.mat
%The size of each matrix is (number of days) x (number of individuals). For instance,
%distance_mat(1,1000) is the distance that the individual with user number 1000 ran on the 1st day under consideration.
%StartTime_mat is the matrix that gives the local start time of the daily
%activity (e.g. 10.5 means 10:30am) and TimeZone_mat gives the time zone of the daily activity; for
%example, -4 means GMT-4:00 and 0 means Greenwich time (GMT+0:00).


%Load the full activity log from a tab-delimited text file (load time is printed by tic/toc).
tic;SPA_Social_graph=dataset('File','SPA_Social_graph.txt','Delimiter','\t','format','%s%s%d%s%s%f%f%f%f%f%f%d%d%d'); toc; 
%Earliest and latest date found in the log, kept as date strings.
%The variable names must be exactly first_day_of_observation / last_day_of_observation,
%because those are the names read further down when the day range is built.
%NOTE(review): the date column is read as "datenumbers" everywhere else in this script,
%so the same name is used here - confirm against the header of SPA_Social_graph.txt.
first_day_of_observation=datestr(min(SPA_Social_graph.datenumbers));
last_day_of_observation=datestr(max(SPA_Social_graph.datenumbers));


%Keep only the running activities: rows whose ACTIVITY_TYPE_ID equals 2.
%All other sport activities are left out of SPA_RUNS_Social_graph.
SPA_RUNS_Social_graph=SPA_Social_graph(SPA_Social_graph.ACTIVITY_TYPE_ID==2,:);
%------------------------------------------------------------------------
%Demographics table of the app users.
App_Users_in_Graph_demographics=dataset('File','App_Users_in_Graph_demographics.txt');

%UPM_USER_ID rendered as text: one trimmed string per user, in a cell array.
s=strtrim(cellstr(num2str(App_Users_in_Graph_demographics.UPM_USER_ID)));

%Lookup table pairing USER_ID with the textual UPM_USER_ID.
App_users_IDS=struct();
App_users_IDS.USER_ID=App_Users_in_Graph_demographics.USER_ID;
App_users_IDS.UPM_USER_ID=s;
App_users_IDS=struct2dataset(App_users_IDS);

%Social links between users.
USERREL_USEDFOR_SOCIAL_INFLUENCE=dataset('File','USERREL.txt'); 

%User number = row position in the demographics table (column vector 1..number of users).
User_num_in_App_Users_in_Graph_demographics=(1:size(App_Users_in_Graph_demographics,1))';


%Serial date numbers of every day between the first and last observation, as a column vector.
daynumbers_consideration=(datenum(first_day_of_observation):datenum(last_day_of_observation))';

%Drop the days before the first social link was created.
%A ">=" search is used instead of an exact "==" match: if the first link predates the
%observation window, or its date number carries a time-of-day fraction, an exact match
%finds nothing and the script would later fail with an unrelated indexing error.
l=find(daynumbers_consideration>=floor(min(USERREL_USEDFOR_SOCIAL_INFLUENCE.created_datenumber)),1); %first day on which a link exists
if isempty(l)
    error('No observation day falls on or after the first social link date.');
end
daynumbers_consideration=daynumbers_consideration(l:end); 

%Keep only the runs recorded on or after the first day under consideration.
SPA_RUNS_Social_graph_in_dates_of_social_lnks=SPA_RUNS_Social_graph(SPA_RUNS_Social_graph.datenumbers>=daynumbers_consideration(1),:);
size(SPA_RUNS_Social_graph_in_dates_of_social_lnks)


%Sanity check that every kept run lies inside the range of days under consideration.
%Both displayed counts are expected to be 0: runs after the last day, then runs before the first day.
%The dates are read from the filtered run dataset (a bare "datenumbers" variable does not exist).
sum(SPA_RUNS_Social_graph_in_dates_of_social_lnks.datenumbers>daynumbers_consideration(end))
sum(SPA_RUNS_Social_graph_in_dates_of_social_lnks.datenumbers<daynumbers_consideration(1))


%Preallocate the result matrices: one row per day under consideration, one column per user.
%Counts and totals start at 0; start time and time zone start at NaN (no activity recorded).
run_mat=zeros(numel(daynumbers_consideration),numel(User_num_in_App_Users_in_Graph_demographics));
distance_mat=zeros(size(run_mat)); %km
duration_mat=zeros(size(run_mat)); %minutes
calories_mat=zeros(size(run_mat));
StartTime_mat=NaN(size(run_mat));
TimeZone_mat=NaN(size(run_mat));

%Fill the matrices with the total running activity per person per day.
%For every day t:
%  run_mat       - number of runs of each user on that day
%  StartTime_mat - local start time of one of the user's runs (the one picked by the
%                  index output "ia" of unique)
%  TimeZone_mat  - time zone offset of that same run
%  distance_mat, duration_mat, calories_mat - sums over all runs of the user on that day
%NOTE(review): USER_NUM is used directly as a column index, so it is assumed to be the
%1-based user number matching the columns of the matrices - confirm.
for t=1:length(daynumbers_consideration)
    %rows of the filtered run dataset that fall on day t
    events=find(SPA_RUNS_Social_graph_in_dates_of_social_lnks.datenumbers==daynumbers_consideration(t));
    if ~isempty(events)
        individuals=SPA_RUNS_Social_graph_in_dates_of_social_lnks.USER_NUM(events);
        %ib maps every event to the position of its user in unique_individuals
        [unique_individuals,ia,ib]=unique(individuals);
        ib=ib(:); %accumarray expects a column of subscripts
        L=accumarray(ib,1); %runs per user on day t
        run_mat(t,unique_individuals)=L;
        StartTime_mat(t,unique_individuals)=SPA_RUNS_Social_graph_in_dates_of_social_lnks.START_TIME_LOCAL(events(ia));
        TimeZone_mat(t,unique_individuals)=SPA_RUNS_Social_graph_in_dates_of_social_lnks.TZ_offset(events(ia));
        %Per-user totals. "events" indexes the FILTERED run dataset, so the values must be
        %read from that dataset and not from the unfiltered SPA_Social_graph (whose rows
        %are numbered differently). double() keeps the sums in floating point.
        distance_mat(t,unique_individuals)=accumarray(ib,double(SPA_RUNS_Social_graph_in_dates_of_social_lnks.DISTANCE(events)));
        duration_mat(t,unique_individuals)=accumarray(ib,double(SPA_RUNS_Social_graph_in_dates_of_social_lnks.DURATION(events)));
        calories_mat(t,unique_individuals)=accumarray(ib,double(SPA_RUNS_Social_graph_in_dates_of_social_lnks.CALORIES(events)));
    end
    %progress indicator: display the day index every 10 days
    if mod(t,10)==0
        t
    end
end

%Put NaN (Not a Number) on every day before the registration day (join date) of each individual.
join_date_num=datenum(App_Users_in_Graph_demographics.JOIN_DATE);
for i=1:length(run_mat(1,:))
    if join_date_num(i)>daynumbers_consideration(1)
        %Days strictly before the join day. A logical mask also covers users who joined
        %after the last day under consideration (their whole column becomes NaN);
        %floor() ignores any time-of-day part of the join date.
        pre_join=daynumbers_consideration<floor(join_date_num(i));
        run_mat(pre_join,i)=NaN;
        distance_mat(pre_join,i)=NaN;
        duration_mat(pre_join,i)=NaN;
        calories_mat(pre_join,i)=NaN;
        %Only this user's pre-join entries are masked; assigning NaN to the whole
        %variable would replace the entire matrix with a single scalar.
        StartTime_mat(pre_join,i)=NaN;
        TimeZone_mat(pre_join,i)=NaN;
    end
    %progress indicator: display the user index every 10000 users
    if mod(i,10000)==0
        i
    end
end
%Invalid records: any negative count or total is replaced by NaN.
run_mat(run_mat<0)=NaN;
distance_mat(distance_mat<0)=NaN;
duration_mat(duration_mat<0)=NaN;
calories_mat(calories_mat<0)=NaN;

%Rescale the duration by 1/1000 and then 1/60 so that it is expressed in minutes.
duration_mat=duration_mat/1000/60; %in minutes

%Pace matrix = distance ./ duration.
%Zero distances are temporarily marked with -1 so that the quotient comes out negative
%and can be reset to 0 afterwards.
pace_mat=distance_mat;
pace_mat(pace_mat==0)=-1;
pace_mat=pace_mat./duration_mat; %in km/minute
pace_mat(pace_mat<0)=0;


%Write each result matrix to its own MAT-file (version 7.3 format).
save('run_mat.mat','run_mat','-v7.3');
save('distance_mat.mat','distance_mat','-v7.3');
save('duration_mat.mat','duration_mat','-v7.3');
save('calories_mat.mat','calories_mat','-v7.3');
save('pace_mat.mat','pace_mat','-v7.3');
save('StartTime_mat','StartTime_mat','-v7.3');
save('TimeZone_mat','TimeZone_mat','-v7.3');
%The size of matrices is (number of days) x (number of individuals). For instance
%distance_mat(1,1000) is the distance individual with user number=1000 run on the first day of consideration.
